creating ranks

The ranks are a named numeric vector, with the entrezgene_id as names and the Wald test statistic (`stat`) as values.

#' Build the ranked gene-level statistics used as `stats` input for fgsea.
#'
#' @param res   Differential-expression result whose row names are gene IDs
#'   and which has a `stat` column (presumably a DESeq2 results object --
#'   confirm); it is converted with `as.data.frame()`.
#' @param annot Annotation table with at least the columns `GeneID` and
#'   `entrezgene_id`.
#' @return Named numeric vector: names are the `entrezgene_id`s, values the
#'   `stat`s, sorted by decreasing `stat`. Genes without an entrezgene_id or
#'   without a `stat` value are left out.
getRanks <- function(res, annot) {
  # Only genes with an entrezgene_id can be matched against the pathways.
  # `dplyr::` is spelled out because `select()` is frequently masked by
  # annotation packages. Identical (GeneID, entrezgene_id) pairs are dropped:
  # they would only repeat the very same name/value entry in the ranks.
  genes_with_entrez <- annot %>%
    dplyr::select(GeneID, entrezgene_id) %>%
    dplyr::filter(!is.na(entrezgene_id)) %>%
    dplyr::distinct()

  as.data.frame(res) %>%
    tibble::rownames_to_column("GeneID") %>%
    merge(genes_with_entrez, by = "GeneID") %>%
    # a missing statistic cannot be ranked
    dplyr::filter(!is.na(stat)) %>%
    dplyr::arrange(dplyr::desc(stat)) %>%
    dplyr::select(entrezgene_id, stat) %>%
    tibble::deframe() # named numeric vector from the two columns
}

# ranked statistics for each of the two tissues
ranks.gastroc <- getRanks(res = res.gastroc, annot = annot)
ranks.soleus <- getRanks(res = res.soleus, annot = annot)
# TODO: find out why (again) the soleus gene count / ranks count seems to be
#       about 300 below the gastroc one

ranks distribution

duplicate entrezgene_ids

Duplicate entrezgene ids (multiple entrez ids are mapped to the same gene name).

Thus a quick look at whether any of these genes can simply be omitted, e.g. if the gene type (`gene_biotype`) is “other” or “tRNA”.

# TODO: run this check on the merged table (the genes actually used for the
#       ranks) instead of the full `annot`?
# ENSEMBL ids first: rows of `annot` whose GeneID already occurred in an
# earlier row, counted per (GeneID, gene_biotype) combination.
duplicate_ENSEMBL <- annot[duplicated(annot$GeneID), ] %>%
  dplyr::group_by(GeneID, gene_biotype) %>%
  summarise(n = n())
## `summarise()` has grouped output by 'GeneID'. You can override using the `.groups` argument.
# Same for the external gene names: rows whose external_gene_name already
# occurred in an earlier row, again counted per (GeneID, gene_biotype).
duplicate_geneNames <- annot[duplicated(annot$external_gene_name), ] %>% 
  dplyr::group_by(GeneID, gene_biotype) %>%
  summarise(n = n())
## `summarise()` has grouped output by 'GeneID'. You can override using the `.groups` argument.
# No GeneID appears in more than one (GeneID, gene_biotype) group, i.e. the
# duplicated entries of a gene always share the same gene_biotype.
anyDuplicated(duplicate_ENSEMBL$GeneID)   # 0
## [1] 0
anyDuplicated(duplicate_geneNames$GeneID) # 0
## [1] 0
# How many of the ENSEMBL duplicates are also among the gene-name duplicates?
sum(duplicate_ENSEMBL$GeneID %in% duplicate_geneNames$GeneID) %>%
  sprintf(
    "%d / %d ENSEMBL duplications are also included in the duplicated gene_names",
    .,
    nrow(duplicate_ENSEMBL)
  )
## [1] "173 / 173 ENSEMBL duplications are also included in the duplicated gene_names"
# number of gene-name duplicate groups in excess of the ENSEMBL duplicate groups
diff_ENS_gene <- nrow(duplicate_geneNames) - nrow(duplicate_ENSEMBL)

Since all ENSEMBL duplicates are also found in the gene_name duplicates, using the ENSEMBL as id for the ranks would reduce the number of duplicates by 107.

Actually, most of them are “protein coding” and will therefore not be omitted.

loading pathways

Pathways are provided by http://www.gsea-msigdb.org/gsea/msigdb/mouse/collections.jsp

For now the canonical pathways (CP) are used. These gene sets represent biological processes. They are composed of a subset of CP taken from the following databases:

database gene sets
BioCarta 252
Reactome 1249
WikiPathways 186

applying fgsea

# Run fgsea with the gene sets in `CGP` and the ranked statistics in `ranks`;
# minSize / maxSize restrict the tested gene sets to sizes from 15 to 380.
# NOTE(review): `ranks` is not defined in the visible code (only
# `ranks.gastroc` / `ranks.soleus` are) and the later chunks use
# `fgseaRes.gastroc` / `fgseaRes.soleus` -- presumably this call is run once
# per tissue; confirm.
fgseaRes <- fgsea(
  pathways = CGP,
  stats    = ranks,
  minSize  = 15,
  maxSize  = 380
)

Enrichment score plot

Ordering pathways by padj values and using the sign of ES to separate up- from down-regulated pathways.

#' Obtain the top pathways of an fgsea result, ordered by `padj`.
#'
#' The sign of `ES` decides whether a pathway counts as up- or down-regulated.
#'
#' @param fgseaRes fgsea result table with (at least) the columns `ES` and
#'   `padj`.
#' @param up `TRUE` keeps pathways with `ES > 0`, `FALSE` those with `ES < 0`.
#' @param pCutoff Only pathways with `padj < pCutoff` are kept.
#' @param n Maximum number of pathways to return.
#' @return The matching rows of `fgseaRes`, sorted by increasing `padj`
#'   (at most `n` rows).
get_top_pathways <- function(fgseaRes, up = TRUE, pCutoff = params$pCutoff, n = 10) {
  # a non-scalar or NA `up` would otherwise only fail later, inside filter(),
  # with an unrelated-looking error
  stopifnot(length(up) == 1L, !is.na(up))

  # plain if/else instead of ifelse(): a function is selected, not a vector
  .updown <- if (up) `>` else `<`

  fgseaRes %>%
    dplyr::filter(.updown(ES, 0), padj < pCutoff) %>%
    dplyr::arrange(padj) %>%
    dplyr::slice_head(n = n)
}

plot for top up and down regulated pathways

#' Plot the enrichment curves of the top `n` pathways of an fgsea result.
#'
#' @param fgseaRes fgsea result table, passed on to `get_top_pathways()`.
#' @param pathways Named list of gene sets; indexed by pathway name.
#' @param ranks Named gene-level statistics, passed on to `plotEnrichment()`.
#' @param n Number of top pathways to plot.
#' @param up `TRUE` for up-regulated, `FALSE` for down-regulated pathways.
#' @return The arranged figure from `arrange_plts()`, or `NULL` (invisibly)
#'   when no pathway passes the cutoff.
plot_top_enrichment <- function(fgseaRes, pathways, ranks, n = 9, up = TRUE) {
  # extracting the top n pathways
  top.pathways <- get_top_pathways(fgseaRes, up = up, pCutoff = params$pCutoff, n = n)

  # take the names as a plain vector: `top.pathways[i]` would select a row
  # only for a data.table, but a column for a data.frame / tibble
  pathway.names <- top.pathways$pathway

  # nothing to plot; `1:nrow()` would have iterated over c(1, 0) here
  if (length(pathway.names) == 0) {
    message("no pathway passes the cutoff, nothing to plot")
    return(invisible(NULL))
  }

  # one enrichment plot per pathway
  # TODO: how can I use facet_wrap for this?
  plot.list <- lapply(pathway.names, function(pathway) {
    plotEnrichment(pathways[[pathway]], ranks) +
      # TODO: adjust yaxis to the same scale, e.g. via
      #       coord_cartesian(xlim = c(0, 17000), ylim = c(-0.8, 0.0))
      # TODO: keep axis.text.x only on the lower row
      # TODO: keep axis.text.y only on the right column
      theme(
        axis.title.x = element_blank(),
        axis.title.y = element_blank()
      )
  })

  arrange_plts(plot.list)
}

#' Helper function to arrange the enrichment plots into one labelled figure.
#'
#' @param plt.list Non-empty list of ggplot objects. The axis labels stored in
#'   the first plot are used as shared axis titles of the whole figure.
#' @return The figure returned by `ggpubr::annotate_figure()`; the panels are
#'   labelled "A", "B", ... in list order.
arrange_plts <- function(plt.list) {
  nplts <- length(plt.list)
  if (nplts == 0) {
    stop("`plt.list` must contain at least one plot", call. = FALSE)
  }

  # `[[` extracts the plot itself; `[` returns a one-element list without
  # `$labels`, which left both axis titles NULL
  first.plt <- plt.list[[1]]
  xlab <- first.plt$labels$x
  ylab <- first.plt$labels$y

  # TODO: set axis to the same scale, e.g. x = c(0, 17000), y = c(-0.8, 0.0)
  # TODO: remove all x-axis labels except lower row

  # arrange the plots, one capital letter per panel
  fig_labels <- LETTERS[seq_len(nplts)]

  ggpubr::ggarrange(plotlist = plt.list, labels = fig_labels) %>%
    ggpubr::annotate_figure(
      left = ggpubr::text_grob(ylab, rot = 90),
      bottom = ggpubr::text_grob(xlab)
    )
}



# plot_labels <-
#     data.frame("label" = LETTERS[1:10], "pathway" = top.pathways)
# knitr::kable(caption = "plot labels", plot_labels)

gastroc up

plot_top_enrichment(fgseaRes.gastroc, CGP, ranks.gastroc, up = TRUE)

# TODO: add plot labels to return argument of plot_top_enrichment (use list probably)
# Label only as many panels as pathways were actually returned: a fixed
# LETTERS[1:9] errors (or is silently recycled) when fewer than 9 pathways
# pass the cutoff.
plot_labels <- data.frame(
  "pathway" = get_top_pathways(fgseaRes.gastroc, up = TRUE, n = 9)$pathway
)
plot_labels <- data.frame(
  "label" = LETTERS[seq_len(nrow(plot_labels))],
  plot_labels
)
knitr::kable(caption = "plot labels", plot_labels)
plot labels
label pathway
A WP_TYROBP_CAUSAL_NETWORK_IN_MICROGLIA
B WP_MICROGLIA_PATHOGEN_PHAGOCYTOSIS_PATHWAY
C WP_APOPTOSIS
D WP_FIBRIN_COMPLEMENT_RECEPTOR_3_SIGNALING_PATHWAY
E REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL
F WP_CHEMOKINE_SIGNALING_PATHWAY
G BIOCARTA_TNFR2_PATHWAY
H REACTOME_GPCR_LIGAND_BINDING
I REACTOME_CLASS_A_1_RHODOPSIN_LIKE_RECEPTORS

gastroc down

plot_top_enrichment(fgseaRes.gastroc, CGP, ranks.gastroc, up = FALSE)

# Label only as many panels as pathways were actually returned: a fixed
# LETTERS[1:9] errors (or is silently recycled) when fewer than 9 pathways
# pass the cutoff.
plot_labels <- data.frame(
  "pathway" = get_top_pathways(fgseaRes.gastroc, up = FALSE, n = 9)$pathway
)
plot_labels <- data.frame(
  "label" = LETTERS[seq_len(nrow(plot_labels))],
  plot_labels
)
knitr::kable(caption = "plot labels", plot_labels)
plot labels
label pathway
A REACTOME_THE_CITRIC_ACID_TCA_CYCLE_AND_RESPIRATORY_ELECTRON_TRANSPORT
B REACTOME_RESPIRATORY_ELECTRON_TRANSPORT_ATP_SYNTHESIS_BY_CHEMIOSMOTIC_COUPLING_AND_HEAT_PRODUCTION_BY_UNCOUPLING_PROTEINS
C REACTOME_RESPIRATORY_ELECTRON_TRANSPORT
D REACTOME_ANTIGEN_PROCESSING_UBIQUITINATION_PROTEASOME_DEGRADATION
E WP_ELECTRON_TRANSPORT_CHAIN
F REACTOME_METABOLISM_OF_AMINO_ACIDS_AND_DERIVATIVES
G REACTOME_TRANSLATION
H REACTOME_NEDDYLATION
I REACTOME_COMPLEX_I_BIOGENESIS

soleus up

plot_top_enrichment(fgseaRes.soleus, CGP, ranks.soleus, up = TRUE)

# Label only as many panels as pathways were actually returned: a fixed
# LETTERS[1:9] errors (or is silently recycled) when fewer than 9 pathways
# pass the cutoff.
plot_labels <- data.frame(
  "pathway" = get_top_pathways(fgseaRes.soleus, up = TRUE, n = 9)$pathway
)
plot_labels <- data.frame(
  "label" = LETTERS[seq_len(nrow(plot_labels))],
  plot_labels
)
knitr::kable(caption = "plot labels", plot_labels)
plot labels
label pathway
A REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE
B REACTOME_FORMATION_OF_A_POOL_OF_FREE_40S_SUBUNITS
C WP_CYTOPLASMIC_RIBOSOMAL_PROTEINS
D REACTOME_NONSENSE_MEDIATED_DECAY_NMD_INDEPENDENT_OF_THE_EXON_JUNCTION_COMPLEX_EJC
E WP_TYROBP_CAUSAL_NETWORK_IN_MICROGLIA
F REACTOME_EUKARYOTIC_TRANSLATION_INITIATION
G REACTOME_NONSENSE_MEDIATED_DECAY_NMD
H REACTOME_MAJOR_PATHWAY_OF_RRNA_PROCESSING_IN_THE_NUCLEOLUS_AND_CYTOSOL
I REACTOME_INTERLEUKIN_7_SIGNALING

soleus down

plot_top_enrichment(fgseaRes.soleus, CGP, ranks.soleus, up = FALSE)

# Label only as many panels as pathways were actually returned: a fixed
# LETTERS[1:9] errors (or is silently recycled) when fewer than 9 pathways
# pass the cutoff.
plot_labels <- data.frame(
  "pathway" = get_top_pathways(fgseaRes.soleus, up = FALSE, n = 9)$pathway
)
plot_labels <- data.frame(
  "label" = LETTERS[seq_len(nrow(plot_labels))],
  plot_labels
)
knitr::kable(caption = "plot labels", plot_labels)
plot labels
label pathway
A REACTOME_KEAP1_NFE2L2_PATHWAY
B REACTOME_CELLULAR_RESPONSE_TO_CHEMICAL_STRESS
C REACTOME_GLI3_IS_PROCESSED_TO_GLI3R_BY_THE_PROTEASOME
D REACTOME_ASYMMETRIC_LOCALIZATION_OF_PCP_PROTEINS
E REACTOME_RUNX1_REGULATES_TRANSCRIPTION_OF_GENES_INVOLVED_IN_DIFFERENTIATION_OF_HSCS
F REACTOME_ABC_FAMILY_PROTEINS_MEDIATED_TRANSPORT
G REACTOME_DEGRADATION_OF_DVL
H REACTOME_FCERI_MEDIATED_NF_KB_ACTIVATION
I REACTOME_UBIQUITIN_MEDIATED_DEGRADATION_OF_PHOSPHORYLATED_CDC25A

GSEA table plot

gastroc

top significant pathways:

# The up- and down-regulated pathways are collected separately, so that each
# direction contributes its own top 10; the combined set is then ordered by
# decreasing NES.

topUp <- get_top_pathways(
  fgseaRes.gastroc,
  up = TRUE, pCutoff = params$pCutoff, n = 10
)
topDown <- get_top_pathways(
  fgseaRes.gastroc,
  up = FALSE, pCutoff = params$pCutoff, n = 10
)
topPathways <- bind_rows(topUp, topDown) %>%
  arrange(desc(NES)) %>%
  pull(pathway)

# wrapped in as_ggplot(), since, for whatever reason, only `NULL` gets
# returned if `plotGseaTable` is rendered inline
ggpubr::as_ggplot(
  plotGseaTable(
    pathways = CGP[topPathways],
    stats = ranks.gastroc,
    fgseaRes = fgseaRes.gastroc,
    gseaParam = 0.5,
    render = TRUE
  )
)

soleus

top significant pathways:

# The up- and down-regulated pathways are collected separately, so that each
# direction contributes its own top 10; the combined set is then ordered by
# decreasing NES.

topUp <- get_top_pathways(
  fgseaRes.soleus,
  up = TRUE, pCutoff = params$pCutoff, n = 10
)
topDown <- get_top_pathways(
  fgseaRes.soleus,
  up = FALSE, pCutoff = params$pCutoff, n = 10
)
topPathways <- bind_rows(topUp, topDown) %>%
  arrange(desc(NES)) %>%
  pull(pathway)

# wrapped in as_ggplot(), since, for whatever reason, only `NULL` gets
# returned if `plotGseaTable` is rendered inline
ggpubr::as_ggplot(
  plotGseaTable(
    pathways = CGP[topPathways],
    stats = ranks.soleus,
    fgseaRes = fgseaRes.soleus,
    gseaParam = 0.5,
    render = TRUE
  )
)

most differential regulated pathways, both tissues

Using the NES from the fgsea results of both tissues and keeping the pathways that pass the set pCutoff = 0.01 in at least one tissue yields the following plot:

# Combine the fgsea results of both tissues per pathway and classify every
# pathway by the direction (sign of NES) and significance (padj < pCutoff)
# it shows in gastroc (.ga) and soleus (.sol).
pCutoff <- params$pCutoff
res.combined <- merge(
  data.frame(fgseaRes.gastroc[, c("pathway", "NES", "padj")]),
  data.frame(fgseaRes.soleus[, c("pathway", "NES", "padj")]),
  by = "pathway",
  suffixes = c(".ga", ".sol")
) %>%
  # keep pathways that are significant in at least one tissue
  filter(padj.ga < pCutoff | padj.sol < pCutoff) %>%
  mutate(
    # Explicit significance flags: an NA padj or a padj exactly equal to the
    # cutoff counts as "not significant". Comparing with a strict `>` instead
    # sent such pathways to "different" although one tissue was significant.
    sig.ga = !is.na(padj.ga) & padj.ga < pCutoff,
    sig.sol = !is.na(padj.sol) & padj.sol < pCutoff,
    diff.exp = case_when(
      sig.ga & sig.sol & NES.ga < 0 & NES.sol < 0 ~ "both down",
      sig.ga & sig.sol & NES.ga > 0 & NES.sol > 0 ~ "both up",
      sig.ga & sig.sol & NES.ga < 0 & NES.sol > 0 ~ "ga down, sol up",
      sig.ga & sig.sol & NES.ga > 0 & NES.sol < 0 ~ "ga up, sol down",
      sig.ga & !sig.sol & NES.ga < 0 ~ "ga down",
      sig.ga & !sig.sol & NES.ga > 0 ~ "ga up",
      !sig.ga & sig.sol & NES.sol < 0 ~ "sol down",
      !sig.ga & sig.sol & NES.sol > 0 ~ "sol up",
      # only reached when a deciding NES is 0 or NA
      TRUE ~ "different"
    )
  ) %>%
  # the helper flags are not part of the result
  dplyr::select(-sig.ga, -sig.sol)

# Final plot: NES of gastroc against NES of soleus, one point per pathway,
# coloured by its regulation class. The `text` aesthetic carries the pathway
# name so that it shows up in the plotly tooltip.
p <- ggplot(res.combined, aes(x = NES.ga, y = NES.sol, text = pathway)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  geom_point(aes(color = diff.exp)) +
  labs(title = "NES", x = "gastroc", y = "soleus")
# currently disabled layers:
#   scale_color_manual(values = c("red", "chartreuse1", "bisque", "royalblue"))
#   ggrepel::geom_label_repel(max.overlaps = 20)

plotly::ggplotly(p, tooltip = "all")

barplot

# number of pathways per regulation class, one coloured bar per class
ggplot(res.combined, aes(x = diff.exp, fill = diff.exp)) +
  geom_bar()

currentTODOs

[ ] looking at duplicate entrezgene_ids
[ ] finding optimal maxSize (one sided curve)
[ ] find out biological meaning of significant pathways

Questions